Introduction

The current experiment is a cell multiplex experiment involving EpCam+ sorted mouse Thymic Epithelial Fibroblast, where 6 samples were hash tagged using Totalseq A HashTag antibodies and combined together for a single run in 10x chromium. The GEX was preprocessed with cellranger 8.0 and the CiteSeq data was preprocessed using Cite-Seq-Count. Both analysis yielded matrix files that is being used to create a Seurat object.

Setup the working environment

library(dplyr)
library(Seurat)
library(sctransform)
library(patchwork)
library(ggplot2)
library(glmGamPoi)
library(lme4)
library(magrittr)
library(scCustomize)
library(presto)
library(edgeR)
library(matrixStats)
library(tibble)
library(ggrepel)
lapply(c("HGNChelper","openxlsx"), library, character.only = T)
## [[1]]
##  [1] "HGNChelper"   "ggrepel"      "tibble"       "matrixStats"  "edgeR"       
##  [6] "limma"        "presto"       "data.table"   "Rcpp"         "scCustomize" 
## [11] "magrittr"     "lme4"         "Matrix"       "glmGamPoi"    "ggplot2"     
## [16] "patchwork"    "sctransform"  "Seurat"       "SeuratObject" "sp"          
## [21] "dplyr"        "stats"        "graphics"     "grDevices"    "utils"       
## [26] "datasets"     "methods"      "base"        
## 
## [[2]]
##  [1] "openxlsx"     "HGNChelper"   "ggrepel"      "tibble"       "matrixStats" 
##  [6] "edgeR"        "limma"        "presto"       "data.table"   "Rcpp"        
## [11] "scCustomize"  "magrittr"     "lme4"         "Matrix"       "glmGamPoi"   
## [16] "ggplot2"      "patchwork"    "sctransform"  "Seurat"       "SeuratObject"
## [21] "sp"           "dplyr"        "stats"        "graphics"     "grDevices"   
## [26] "utils"        "datasets"     "methods"      "base"
library(SCpubr)
library(decoupleR)
library(ReactomeGSA)

Load the data and preprocess to have the same barcodes in cells and

gex_data<-Read10X("filtered_feature_bc_matrix/")
hto_data<-Read10X("umi_count/", gene.column=1)
hto_data <- hto_data[-c(7),]
current_col_names <- colnames(hto_data)
new_col_names <- paste0(current_col_names, "-1")
colnames(hto_data) <- new_col_names
current_row_names <- rownames(hto_data)
new_row_names <- gsub("-.*", "", current_row_names)
rownames(hto_data) <- new_row_names
dim(hto_data)
## [1]    6 9116
dim(gex_data)
## [1] 33696  9116
joint.bcs <- intersect(colnames(gex_data), colnames(hto_data))
gex_data <- gex_data[, joint.bcs]
hto_data <- as.matrix(hto_data[, joint.bcs])

# remove the variable unmapped from HTO object

dim(hto_data)
## [1]    6 8856
dim(gex_data)
## [1] 33696  8856

Create Seurat Object

Creating a seurat object with the gene expression data and adding the HTO data as a seperate assay

data <- CreateSeuratObject(counts = gex_data,
                           project = "Jason data",  #Name this whatever.
                           names.delim = NULL)  # Don't try and parse the sample names

HTO <- CreateAssay5Object(counts = hto_data)
## Warning: Data is of class matrix. Coercing to dgCMatrix.
data[["HTO"]] <- HTO

Filter poor quality or uninteresting cells

#Assess percent of mitochondrial counts in each cell 
data[["percent_mt"]] <- PercentageFeatureSet(object = data, 
                                             pattern = "^mt-")

#Violin plot
VlnPlot(object = data, 
        features = c("nFeature_RNA", 
                   "nCount_RNA", 
                   "percent_mt"), 
        ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

#Other plotting otions
plot1 <- FeatureScatter(object = data, 
                        feature1 = "nCount_RNA", 
                        feature2 = "percent_mt")
plot2 <- FeatureScatter(object = data,
                        feature1 = "nCount_RNA", 
                        feature2 = "nFeature_RNA")
plot1 + plot2

#Subset data
data <- subset(x = data, 
               subset =  nFeature_RNA > 200 & 
                 nCount_RNA > 5000 & 
                 percent_mt < 15)

VlnPlot(object = data, 
        features = c("nFeature_RNA",
                   "nCount_RNA",
                   "percent_mt"),
        ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

SCTransform normalization

options(future.globals.maxSize = 1024^3 * 4)  # Set to 4 GiB
data <- SCTransform(data, method= "glmGamPoi", verbose = FALSE)

Normalize and demultiplex HTO data

data <- NormalizeData(data, assay = "HTO", normalization.method = "CLR")
## Normalizing layer: counts
## Normalizing across features
data <- HTODemux(data, assay = "HTO", positive.quantile = 0.99)
## As of Seurat v5, we recommend using AggregateExpression to perform pseudo-bulk analysis.
## This message is displayed once per session.
## First group.by variable `ident` starts with a number, appending `g` to ensure valid variable names
## Cutoff for KO1 : 15 reads
## 
## Cutoff for KO2 : 11 reads
## 
## Cutoff for WT1 : 51 reads
## 
## Cutoff for KO3 : 12 reads
## 
## Cutoff for WT2 : 31 reads
## 
## Cutoff for WT3 : 95 reads
## 
## This message is displayed once every 8 hours.
table(data$HTO_classification.global)
## 
##  Doublet Negative  Singlet 
##      853        4     6688
Idents(data) <- "HTO_classification.global"
VlnPlot(data, features = "nCount_RNA", pt.size = 0.1, log = TRUE)

##Visualize HTO data

DefaultAssay(data) <- "HTO"
data <- ScaleData(data, features = rownames(data),
    verbose = FALSE)
data <- RunPCA(data, features = rownames(data), approx = FALSE)
## Warning: Requested number is larger than the number of available items (6).
## Setting to 6.
## Warning: Requested number is larger than the number of available items (6).
## Setting to 6.
## Warning: Requested number is larger than the number of available items (6).
## Setting to 6.
## Warning: Requested number is larger than the number of available items (6).
## Setting to 6.
## Warning: Requested number is larger than the number of available items (6).
## Setting to 6.
## PC_ 1 
## Positive:  KO1, KO2, WT1 
## Negative:  WT3, WT2, KO3 
## PC_ 2 
## Positive:  KO3, WT2, WT1 
## Negative:  WT3, KO1, KO2 
## PC_ 3 
## Positive:  WT2, KO2, KO1 
## Negative:  WT1, KO3, WT3 
## PC_ 4 
## Positive:  WT1, WT2, KO1 
## Negative:  KO3, WT3, KO2 
## PC_ 5 
## Positive:  KO2, WT1, KO3 
## Negative:  KO1, WT2, WT3
datat <- RunTSNE(data, dims = 1:8, perplexity = 100)
## Warning in `[[.DimReduc`(args$object, cells, args$dims): The following
## embeddings are not present: NA, NA
DimPlot(data)

##Extract singlet data and perform further processing

singlets <- subset(data, idents = "Singlet")

Dimensionality reduction

Linear dimensionality reduction using PCA

DefaultAssay(singlets) <- "SCT"
# Use the highly variable genes to find principal components
singlets <- RunPCA(object = singlets, verbose = FALSE)  
## Warning: Number of dimensions changing from 6 to 50
#Examine and visualize PCA results a few different ways
print(x = singlets[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  Ctsl, Prss16, Ccl25, Tbata, Cdh4 
## Negative:  Nrg1, Syt1, Srgn, Dpp10, Cd52 
## PC_ 2 
## Positive:  Ccl25, Tbata, Prss16, Fabp5, Fgf14 
## Negative:  Ccl21a, Kcnq3, Gm36660, Csmd1, Krt5 
## PC_ 3 
## Positive:  Grid1, Dpp10, Top2a, Hmgb2, H2ac24 
## Negative:  AW112010, Fam3d, Atp1b1, Reg3g, Ly6a 
## PC_ 4 
## Positive:  Ccl21a, Lifr, S100a8, Nrg1, Kcnq3 
## Negative:  Top2a, Hmgb2, H2ac24, H1f5, Mki67 
## PC_ 5 
## Positive:  Gpc5, Iigp1, S100a8, Top2a, Arhgap6 
## Negative:  Ccl25, Tbata, Wfdc18, Ccl21a, Syt1
DimPlot(singlets, reduction = "pca")

ElbowPlot(singlets)

VizDimLoadings(object = singlets, dims = 1:2, reduction = "pca")

DimPlot(singlets, reduction = "pca", group.by = "HTO_classification")

Nonlinear dimensionality reduction for visualization using UMAP

singlets <- RunUMAP(singlets, dims = 1:10, verbose = FALSE)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
DimPlot(singlets, reduction = "umap")

DimPlot(singlets, reduction = "umap", group.by = "HTO_classification")

p1 <- SCpubr::do_DimPlot(sample = singlets,
                        label = FALSE,
                        group.by = "HTO_classification",
                        raster = FALSE,
                        repel=TRUE, legend.position = "bottom")

Clustering

singlets <- FindNeighbors(singlets, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
singlets <- FindClusters(singlets, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6688
## Number of edges: 209140
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8921
## Number of communities: 11
## Elapsed time: 0 seconds
DimPlot(singlets, reduction = "umap", label = TRUE) 

DimPlot(singlets, reduction = "pca", label = TRUE)

p2 <- SCpubr::do_DimPlot(sample = singlets,
                        label = FALSE,
                        raster = FALSE,
                        repel=TRUE, legend.position = "bottom")

##Cell Annotation

Find markers

# find markers for every cluster compared to all remaining cells, report only the positive ones
markers <- FindAllMarkers(singlets, 
                               only.pos = TRUE, 
                               min.pct = 0.25, 
                               logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
markers %>%
  group_by(., cluster) %>% 
  top_n(., n = 10, wt = avg_log2FC) -> top10

DoHeatmap(singlets, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(singlets, features = top10$gene): The following features
## were omitted as they were not found in the scale.data slot for the SCT assay:
## Gm6961, Dgkeos, Trdc, Rag1, Ptprcap, Sh2d1a, Cd8b1, Cd8a, Col6a2, Gfra2, C1s1,
## Tnfsf10, Sult5a1, Pif1, H2bc14, H3c2, H2ac10, H2ac4, Gm13944, Gm15261, Etv5,
## Slc38a4, Scn1a, Ackr4, Adhfe1, Klf15, Tepp, Col8a2, Gm15398, 1700024B18Rik,
## Frem2, Scx, D630003M21Rik, Kcna2, Wnt10b, Tvp23a, Cdh22, A630023P12Rik,
## Gm29481, Dusp28

Evaluating WT vs KO distribution in UMAP

sample_names <- gsub("[0-9]", "", singlets$HTO_classification)
phenotype <- ifelse(grepl("WT", sample_names), "WT", "KO")
singlets$phenotype <- factor(phenotype)
DimPlot_scCustom(singlets, reduction = "umap", split.by = "phenotype")&theme_bw() & theme()
## 
## NOTE: DimPlot_scCustom returns split plots as layout of all plots each
## with their own axes as opposed to Seurat which returns with shared x or y axis.
## To return to Seurat behvaior set `split_seurat = TRUE`.
## 
## -----This message will be shown once per session.-----

p3 <- SCpubr::do_DimPlot(sample = singlets,
                        label = FALSE,
                        split.by = "phenotype",
                        raster = FALSE,
                        repel=TRUE, legend.position = "bottom")
genes_list_1 <- c("Psmb11", "Dll4", "Ccl25", "Enpep", "Ly75", "Cd83") 
genes_list_2 <- c("Prss16", "Ackr4", "Aire", "Fezf2", "Cd80", "Krt5") 
genes_list_3 <- c("Itga6", "Ccl21a", "Ccl19", "Irga2", "Krt10", "Pigr") 
genes_list_4 <- c("H2-Aa", "Cldn3", "Cldn4", "Tnfrsf11a", "Epcam")
FeaturePlot_scCustom(singlets, genes_list_1 ) + plot_annotation(title = "Gene list 1", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()
## 
## NOTE: FeaturePlot_scCustom uses a specified `na_cutoff` when plotting to
## color cells with no expression as background color separate from color scale.
## Please ensure `na_cutoff` value is appropriate for feature being plotted.
## Default setting is appropriate for use when plotting from 'RNA' assay.
## When `na_cutoff` not appropriate (e.g., module scores) set to NULL to
## plot all cells in gradient color palette.
## 
## -----This message will be shown once per session.-----

FeaturePlot_scCustom(singlets, genes_list_2 ) + plot_annotation(title = "Gene list 2", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()

FeaturePlot_scCustom(singlets, genes_list_3 ) + plot_annotation(title = "Gene list 3", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()
## Warning: The following features were omitted as they were not found:
## ℹ Irga2

FeaturePlot_scCustom(singlets, genes_list_4 ) + plot_annotation(title = "Gene list 4", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()

Performing supervised annoation with gene signatures from literature

Previously Campinoti et al., 2020 published a detailed cellular overview of the human postnatal thymus. We took the gene signatures corresponding to mTEC, cTEC and comTEC and applied to our dataset to classify TECs.

early_progenitor_gene_list <- list(c("Ackr4", "Adamts10", "Agrn", "Aldh2", "Aldh6a1", "Amotl1", "Amotl2", "Antxr1", "Apoe",
               "Ar", "Bcam", "Bcl11a", "Bcl2", "Bmp4", "Btg2", "Cbx6", "Ccdc80", "Cdh11", "Cldn8", 
               "Clec11a", "Clstn1", "Col18a1", "Cpne8", "Cthrc1", "Cyp1b1", "Dcn", "Ddr1", "Dhrs3", 
               "Dlk2", "Dnajc13", "Dpp6", "Dpysl2", "Dsc3", "Egr1", "Eid1", 
               "Fkbp9", "Fmod", "Fos", "Fosb", "Frmd6", "Fstl1", "Ogn", "Gas1", "Pak3", "Gbp2", 
               "Palld", "Gnaq", "Pdpn", "Gpm6b", "Penk", "Gprasp1", "Plxdc2", "Gstm2", "Pmp22", 
               "H2-DMa", "Prelp", "Hes6", "Prrg3", "Hic1", "Prss23", "Hsd17b10", "Ptprz1", "Igfbp2", 
               "Igfbp3", "Pygb", "Igfbp5", "Rbp1", "Igfbp7", "Rnase4", "Iigp1", "Scn1a", 
               "Il33", "Serpinf1", "Irgm1", "Serpinh1", "Isl1", "Shisa2", "Itm2c", "Slc2a13", "Kazald1", 
               "Sord", "Lamb1", "Sparc", "Laptm4a", "Spon2", "Limch1", "Spry1", "Ltbp3", "Tcn2", 
               "Maged1", "Tgfbr2", "Megf6", "Tgfbr3", "Meis1", "Thbd", "Mgll", "Thbs1", "Mgp", 
               "Timp2", "Myl9", "Tinagl1", "Mylk", "Tnfrsf19", "Nbl1", "Tns1", "Nell2", "Tns3", 
               "Nfia", "Trim29", "Nfib", "Trp63", "Nfix", "Tspan9", "Nr2f1", "Twsg1", "Nr4a1", 
               "Txnip", "Nrtn", "Unc119", "Ntrk3", "Vmac", "Oat", "Wls", "Wscd1", "Xdh", "Zfp36"))

postnatal_progenitors_gene_list <- list(c("Acta2", "Apoe", "Ascl1", "Boc", "C1s1", "C3", "Cald1", "Ccl11", "Ccl21a", "Clca3a1", 
               "Col6a1", "Col6a2", "Ddx60", "Dpysl3", "Dst", "Emp2", "Flna", 
               "Fst", "Fzd2", "Gas1", "Glul", "Gpx3", "Gsn", "Hpgd", "Htra1", "Id1", "Ifi27l2a", 
               "Igfbp4", "Igfbp5", "Irf7", "Isg15", "Itga6", "Itgb4", "Krt14", "Krt5", "Krt7", 
               "Lamb3", "Lars2", "Lifr", "Mgp", "Myl9", "Nrbp2", "S1pr3", "Slc4a11", "Sox4", "Stat2", 
               "Sult5a1", "Tagln", "Tgfbi", "Tpm2", "Wfikkn2"))

cTEC_gene_list <- list(c("Ccl25", "Cd274", "Cfc1", "Ctsh", "Foxn1", "Kcnip3", "Ly75", "Prss16", "Psmb11", "Scx", "Slc46a2", "Tbx1"))

mTEC_gene_list <- list(c("AA467197", "Adap1", "Adgb", "Aebp2", "Ahcyl2", "Aif1l", "Aire", "Alas1", "Ank", "Ankrd33b",
               "Anxa10", "Aoc1", "Apoa1", "Apoa4", "Apobec1", "Apoc2", "Aqp3", "Arc", "Asprv1", "Atp1a2", 
               "Atp1b1", "Atp6v1c1", "AU040320", "Avil", "AW112010", "Bcl2a1a", "Bcl2l1", "Blnk", 
               "Bmp2k", "Cadps", "Calca", "Calcb", "Casp1", "Casz1", "Ccdc184", "Ccdc88a", "Ccl17", "Ccl20", 
               "Ccl22", "Ccl27a", "Ccl5", "Ccl6", "Ccl9", "Ccr7", "Cd52", "Cd70", "Cdh17", "Cdhr5", "Cdkn1a", 
               "Cdkn1c", "Cdkn2b", "Cdx1", "Chil1", "Ckmt1", "Cldn13", "Cldn3", "Cldn4", "Cldn7", "Cnp", 
               "Cpeb4", "Crabp1", "Crhbp", "Csn2", "Cst6", "Ctrb1", "Ctsh", "Ctss", "Ctsz", "Cxcl2", "Cyba", 
               "Cybb", "Cyp2a4", "Cystm1", "Defb6", "Dgat2", "Dio1", "Dmkn", "Dmpk", "Dnase1l3", 
               "Dscaml1", "Eaf2", "Ebi3", "Ehd1", "Elf3", "Eno2", "Espn", "Etv3", "Fabp4", "Fabp6", 
               "Fabp9", "Fam25c", "Fam89a", "Fcer1g", "Fezf2", "Fgf21", "Fhad1", "Fnbp1", "Fscn1", "Gas7", 
               "Gbp4", "Gda", "Gdf15", "Gjb2", "Gnat3", "Gnb3", "Gnb4", "Gng13", "Gpa33", "Gramd4", "Grap", 
               "Grin2c", "Gstm1", "Gstt1", "Guca2b", "H2-Eb2", "H2-Oa", "Hagh", "Hal", "Hamp", "Hdc", "Hopx", 
               "Icosl", "Igf1", "Igfbp6", "Igsf8", "Il12a", "Il13", "Il1rn", "Il23a", "Il2rg", "Ing1", "Inpp5b", 
               "Insm1", "Itgb8", "Kcnk6", "Kctd12", "Klk1", "Klk1b11", "Klk7", "Krt10", "Krt16", "Krt20", "Krt77", 
               "Krt79", "Lad1", "Laptm5", "Lcp1", "Liph", "Lrrc42", "Lsp1", "Ly6i", "Lypd2", "Lypd8", "Lyz1", 
               "Malat1", "Mctp1", "Mdm2", "Me1", "Mep1a", "Mmp7", "Muc1", "Muc13", "Mxd1", 
               "Myl7", "Myo15b", "N4bp2l1", "Ncf1", "Ncf4", "Neat1", "Nfkbia", "Nos2", "Nostrin", "Nptx2", "Nsmce1", 
               "Nsmce2", "Nts", "Nup85", "Oasl1", "Ogfrl1", "Ooep", "Pcp4", "Pgc", "Pglyrp1", "Pglyrp2", 
               "Pigr", "Pip5k1b", "Pla1a", "Pla2g4a", "Plagl1", "Plb1", "Pld1", "Plekha4", "Pmaip1", "Prap1", "Prg2", 
               "Prg4", "Prokr2", "Psors1c2", "Ptgds", "Ptgs2", "Pyy", "Rab25", "Rac2", "Rap1gap", "Rarres1", "Reg3g", 
               "Resp18", "Rnasel", "Rnf128", "Rogdi", "S100a14", "S100a4", "S100a8", "S100a9", "S100g", "Saa3", "Sat1", 
               "Sel1l3", "Sema4a", "Serpinb12", "Serpinb1a", "Serpinb2", "Serpinb9", "Sgpp1", "Sh2d6", "Sh3tc2", 
               "Sirt1", "Skint2", "Skint4", "Skint9", "Slc13a2", "Slc43a3", "Wfdc21", "Slc4a8", "Slc5a8", "Slc6a20a", 
               "Slco5a1", "Smtnl1", "Sncg", "Spink1", "Spink5", "Spock3"))
singlets <- AddModuleScore(object = singlets, features = early_progenitor_gene_list, name = "earlypro_score")

singlets <- AddModuleScore(object = singlets, features = postnatal_progenitors_gene_list, name = "postnatal_score")

singlets <- AddModuleScore(object = singlets, features = mTEC_gene_list, name = "mTEC_score")

singlets <- AddModuleScore(object = singlets, features = cTEC_gene_list, name = "cTEC_score")


plot1<-FeaturePlot_scCustom(singlets, features="mTEC_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.5)&theme_bw() & theme()& labs(title = "mTEC signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
##   features.
plot2<-FeaturePlot_scCustom(singlets, features="cTEC_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.5)&theme_bw() & theme()& labs(title = "cTEC signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
##   features.
plot3<-FeaturePlot_scCustom(singlets, features="earlypro_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.5)&theme_bw() & theme()& labs(title = "Early Progenitor signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
##   features.
plot4<-FeaturePlot_scCustom(singlets, features="postnatal_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.5)&theme_bw() & theme()& labs(title = "Postnatal Progenitor signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
##   features.
plot1|plot2|plot3|plot4

#Classifying cluster 3 and 7

cluster3.markers <- FindMarkers(singlets, 
                                ident.1 = 3, 
                                min.pct = 0.25)
head(cluster3.markers, n = 10)
##                 p_val avg_log2FC pct.1 pct.2     p_val_adj
## Tmem158  0.000000e+00   2.815598 0.921 0.333  0.000000e+00
## Utf1    1.027280e-300   2.533964 0.888 0.274 2.857174e-296
## Icosl   9.957734e-300   2.349307 0.918 0.284 2.769545e-295
## Cdx1    2.345691e-287   2.443963 0.884 0.278 6.524070e-283
## Ikzf1   1.270043e-268   2.340663 0.918 0.299 3.532370e-264
## Cdh22   1.274582e-264   3.156383 0.598 0.104 3.544995e-260
## Hnf1a   6.090882e-259   2.521828 0.758 0.196 1.694057e-254
## Slc4a8  7.881342e-246   2.591595 0.766 0.211 2.192038e-241
## Snx8    3.261791e-240   2.073517 0.895 0.341 9.072020e-236
## Tvp23a  2.511083e-237   2.621406 0.696 0.174 6.984076e-233
cluster7.markers <- FindMarkers(singlets, 
                                ident.1 = 7, 
                                min.pct = 0.25)
head(cluster7.markers, n = 10)
##        p_val avg_log2FC pct.1 pct.2 p_val_adj
## Kif11      0   4.088734 0.959 0.087         0
## Nusap1     0   4.586684 0.952 0.083         0
## Pclaf      0   4.258100 0.961 0.107         0
## Cdca8      0   4.049844 0.937 0.094         0
## Birc5      0   4.042079 0.931 0.088         0
## Knl1       0   4.326299 0.905 0.066         0
## Kif4       0   3.831519 0.931 0.102         0
## Cdk1       0   4.259671 0.944 0.115         0
## Top2a      0   4.740669 0.998 0.170         0
## Kif15      0   4.161737 0.883 0.063         0

##Finalized cell annotation From the above classifications there are the inferred cell annotation Cluster 0 - mTEC3 Cluster 1 - mTEC2 Cluster 2 - mature cTEC Cluster 3 - mTEC1 Cluster 4 - Postnatal progenitor cells Cluster 5 - immature cTEC Cluster 6 - Early progenitor cells Cluster 7 - Transient amplifying progenitor Cluster 8 - Postnatal progenitor cells Cluster 9 - Residual Cluster 10 - proliferating cTEC

cell_annotation <- data.frame(cluster=0:10,
                             cell_type=c("mTEC3", "mTEC2", "mature cTEC", "mTEC1", "PNP", "immature cTEC", "Early progenitors", "Transient amplifying progenitors", "PNP", "Residual", "proliferating cTEC"))

new.cluster.ids <- c("mTEC3", "mTEC2", "mature cTEC", "mTEC1", "PNP", "immature cTEC", "Early progenitors", "Transient amplifying progenitors", "PNP", "Residual", "proliferating cTEC")
names(new.cluster.ids) <- levels(singlets)
singlets <- RenameIdents(singlets, new.cluster.ids)
singlets$cell_type <- Idents(singlets)
DimPlot_scCustom(singlets, reduction = "umap", label = TRUE, repel = TRUE)&theme_bw() & theme()

p4 <- SCpubr::do_DimPlot(sample = singlets,
                        label = FALSE,
                        raster = FALSE,
                        repel=TRUE, legend.position = "right")

p4

p1|p2|p4

interferon_markers <- c("Mx1", "Isg15", "Oas1a", "Oas1b", "Oas1g", "Oas2", "Oas3", 
                        "Rsad2", "Ifit1", "Ifit2", "Ifit3", "Ifi44", "Stat1", 
                        "Stat2", "Irf7", "Cxcl10", "H2-Ab1", "Irf1", 
                        "Cxcl9", "Ido1", "Icam1", "Ciita", "Eif2ak2", "Isg20", 
                         "Zbp1", "Bst2")
# Define mouse interferon response markers for each type
interferonI_genes <- c("Mx1", "Isg15", "Oas1a", "Oas1b", "Oas1g", "Oas2", "Oas3", "Rsad2", "Ifit1", "Ifit2", "Ifit3", "Ifi44", "Stat1", "Stat2", "Irf7", "Cxcl10")
interferonII_genes <- c("H2-Ab1", "Irf1", "Cxcl9", "Cxcl10", "Ido1", "Icam1", "Ciita")
interferonIII_genes <- c("Il10rb", "Mx1", "Oas1a", "Oas1b", "Oas1g", "Oas2", "Oas3", "Ifit1", "Ifit2", "Ifit3")

singlets <- AddModuleScore(singlets, features = list(interferon_markers), name = "interferon_score")
singlets <- AddModuleScore(singlets, features = list(interferonI_genes), name = "interferon1_score")
singlets <- AddModuleScore(singlets, features = list(interferonII_genes), name = "interferon2_score")
singlets <- AddModuleScore(singlets, features = list(interferonIII_genes), name = "interferon3_score")

plot5<-FeaturePlot_scCustom(singlets, features="interferon_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.1)&theme_bw() & theme()& labs(title = "Interferon signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
##   features.
plot6<-FeaturePlot_scCustom(singlets, features="interferon1_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.1)&theme_bw() & theme()& labs(title = "Type I Interferon signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
##   features.
plot7<-FeaturePlot_scCustom(singlets, features="interferon2_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.1)&theme_bw() & theme()& labs(title = "Type II Interferon signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
##   features.
plot8<-FeaturePlot_scCustom(singlets, features="interferon3_score1", reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, na_cutoff = 0.1)&theme_bw() & theme()& labs(title = "Type III Interferon signature")
## Warning: Some of the plotted features are from meta.data slot.
## • Please check that `na_cutoff` param is being set appropriately for those
##   features.
plot5|plot6|plot7|plot8

gene_list_5 <- c("Zfp36l1", "Zfp36l2", "Ets2", "Arnt2", "Tshz2", "Irx3") 
gene_list_6 <- c("Klf12", "Aebp1", "Ebf1", "Mafa", "Nkx6-2", "Pou2af1") 
gene_list_7 <- c("Scx", "Cdkn1a", "Cdkn1c", "Ccnd2", "Cables1", "Cdk14")
gene_list_8 <- c("Rspo1", "Sfrp1", "Shisa4", "Shisal1")
FeaturePlot_scCustom(singlets, gene_list_5) + plot_annotation(title = "Gene list 1", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()

FeaturePlot_scCustom(singlets, gene_list_6) + plot_annotation(title = "Gene list 1", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()

FeaturePlot_scCustom(singlets, gene_list_7) + plot_annotation(title = "Gene list 1", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()

FeaturePlot_scCustom(singlets, gene_list_8) + plot_annotation(title = "Gene list 1", theme = theme(plot.title = element_text(size = 20, hjust = 0.5)))&theme_bw() & theme()

##Odds ratio analysis We are calculating the probability of a cell being in a cluster based on whether the cell is WT versus KO.

sample_pheno_data <-  singlets@meta.data %>% select(HTO_classification, phenotype) %>% unique()
##the number of cells in each mouse in each cluster
Ncells <- table(singlets@meta.data$cell_type,
                     singlets@meta.data$HTO_classification) %>% as.data.frame()

colnames(Ncells) <- c("cluster_id", "HTO_classification", "Freq")
TotalCells <- Ncells %>%
  group_by(HTO_classification) %>%
  summarise(TotalCells = sum(Freq))

Ncells %<>% merge(., TotalCells)

Ncells %<>% merge(., sample_pheno_data)
Clusters <- unique(Ncells$cluster_id)
Nclusters <- length(Clusters)

##function to estimate the change in the odds of cluster membership from the 5 day to the 11 day time-point
estimateCellStateChange <- function(k, Ncells, TotalCells, SampleInfo) {
  require(lme4)
  require(gdata)
  print(paste("Cluster", k))
  Ncells_sub <- Ncells %>%
    filter(cluster_id==k)
  
  glmerFit <- glmer(cbind(Freq, (TotalCells - Freq)) ~ (1|HTO_classification) + phenotype, data=Ncells_sub, family = "binomial", control = glmerControl(optimizer = "bobyqa"))
  sglmerFit  <- summary(glmerFit)
  TempRes <- (sglmerFit$coefficients[-1,])

  return(TempRes)
}

ClusterRes <- sapply(Clusters, estimateCellStateChange, Ncells, TotalCells, SampleInfo)
## Loading required package: gdata
## Warning: package 'gdata' was built under R version 4.4.1
## Registered S3 method overwritten by 'gdata':
##   method         from  
##   reorder.factor gplots
## 
## Attaching package: 'gdata'
## The following objects are masked from 'package:data.table':
## 
##     first, last
## The following objects are masked from 'package:dplyr':
## 
##     combine, first, last, starts_with
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
## [1] "Cluster mTEC3"
## boundary (singular) fit: see help('isSingular')
## [1] "Cluster mTEC2"
## [1] "Cluster mature cTEC"
## boundary (singular) fit: see help('isSingular')
## [1] "Cluster mTEC1"
## [1] "Cluster PNP"
## [1] "Cluster immature cTEC"
## [1] "Cluster Early progenitors"
## [1] "Cluster Transient amplifying progenitors"
## [1] "Cluster Residual"
## [1] "Cluster proliferating cTEC"
## boundary (singular) fit: see help('isSingular')
ClusterRes %<>% 
  as.data.frame() %>% 
  t() 
row.names(ClusterRes) <-  paste0("Cluster", Clusters)
ClusterRes <- data.frame(ClusterRes)
colnames(ClusterRes)[c(1,4)] <- c("logOddsRatio_WT_vs_KO","pvalue")

##perform multiple-testing correction
ClusterRes %<>% mutate(p.adjust = p.adjust(pvalue, method = "BH"))
print(ClusterRes)
##                                         logOddsRatio_WT_vs_KO Std..Error
## ClustermTEC3                                       -0.2821077 0.06454679
## ClustermTEC2                                       -0.4800539 0.09830424
## Clustermature cTEC                                  1.7507792 0.10025562
## ClustermTEC1                                       -0.7336324 0.10685314
## ClusterPNP                                         -0.9135195 0.08201985
## Clusterimmature cTEC                                1.4607543 0.23857779
## ClusterEarly progenitors                            1.3344321 0.18246794
## ClusterTransient amplifying progenitors            -0.6438046 0.13314435
## ClusterResidual                                    -1.4030929 0.25594961
## Clusterproliferating cTEC                           0.5908567 0.27612557
##                                            z.value       pvalue     p.adjust
## ClustermTEC3                             -4.370592 1.239102e-05 1.376780e-05
## ClustermTEC2                             -4.883349 1.042991e-06 1.489987e-06
## Clustermature cTEC                       17.463152 2.734109e-68 2.734109e-67
## ClustermTEC1                             -6.865800 6.611947e-12 1.652987e-11
## ClusterPNP                              -11.137786 8.213591e-29 4.106796e-28
## Clusterimmature cTEC                      6.122759 9.196872e-10 1.839374e-09
## ClusterEarly progenitors                  7.313242 2.607733e-13 8.692444e-13
## ClusterTransient amplifying progenitors  -4.835388 1.328863e-06 1.661078e-06
## ClusterResidual                          -5.481911 4.207568e-08 7.012614e-08
## Clusterproliferating cTEC                 2.139812 3.236999e-02 3.236999e-02
ggplot(Ncells, aes(x=phenotype, y=(Freq+1)/TotalCells)) +
  geom_boxplot() +
  geom_point() +
  scale_y_log10() +
  ylab("proportion of cells") +
  facet_grid(cols = vars(cluster_id)) + theme_bw()

singlets@meta.data$phenotype <- as.factor(singlets@meta.data$phenotype)
singlets@meta.data$HTO_classification <- as.factor(singlets@meta.data$HTO_classification)
singlets[["phenotype"]] <- (singlets@meta.data$phenotype) %>%
  as.character() %>%
  gsub(" ", "_", .) %>%
  make.names()
singlets$phenotype <- as.factor(singlets$phenotype)
# Identify clusters to keep
clusters_toKeep <- table(singlets@meta.data$cell_type, singlets@meta.data$phenotype) %>% 
  t() %>% 
  apply(., 2, function(x) median(x)) %>% 
  subset(., is_greater_than(., 100)) %>% 
  names()

# Aggregate expression data
pbData <- AggregateExpression(singlets, assays = "RNA", group.by = c("cell_type", "HTO_classification"))

pb_counts <- pbData$RNA %>% as.data.frame()

# Perform cluster-wise analysis
clusterwise_pseudobulk_expression_association <- function(cluster, count_data, sample_pheno_data) {
  
  print(paste("evaluating cluster", cluster))
  
  # Get cluster-specific counts
  cluster_count_data <- count_data %>% select(starts_with(paste0(cluster)))
  
  col_individuals <- sapply(colnames(cluster_count_data), function(x) strsplit(x, "_")[[1]][2])
  temp_indices <- match(col_individuals, sample_pheno_data$HTO_classification)
  sample_pheno_data <- sample_pheno_data %>% slice(temp_indices)
  
  # Set up the design matrix
  phenotype <- sample_pheno_data %>% mutate(phenotype = as.factor(phenotype)) %>% .$phenotype
  
  # Check if phenotype has at least two levels
  if (length(levels(phenotype)) < 1) {
    print(paste("Skipping cluster", cluster, "- not enough levels in phenotype"))
    return(NULL)
  }
  
  mm <- model.matrix(~ phenotype)
  colnames(mm) <- levels(phenotype)
  row.names(mm) <- colnames(cluster_count_data)

  y <- DGEList(counts = cluster_count_data, group = phenotype)

  # Filter out low expressed genes
  genes_toKeep <- filterByExpr(y, design = mm, min.count=2, min.prop=0.5)
  print(paste("no. of genes after filtering = ", sum(genes_toKeep)))
  y <- y[genes_toKeep,,keep.lib.sizes=FALSE]

  # Normalize for between sample differences
  y <- calcNormFactors(y)
  
  # Estimate dispersion
  y <- estimateDisp(y, design=mm, robust = TRUE) 
  
  # Get the Counts-Per-Million in order to visualize the clustering of the samples in a PCA plot
  cpm <- cpm(y)
  log2_cpm <- log2(cpm + 0.1*min(cpm[cpm > 0]))
  
  # Calculate log-CPM values with log transformation and prior count stabilization
  log_cpm_counts <- log2_cpm
  
  # Estimate variance of the expression of each gene/row
  variance_per_gene <- rowVars(log_cpm_counts)
  use_features_for_pca <- variance_per_gene %>% sort(., decreasing = TRUE) %>% names(.)
  
  # Pick the top 1000 most variable genes
  use_features_for_pca <- use_features_for_pca[1:1000]
 
  log_cpm_counts_t <- t(log_cpm_counts[row.names(log_cpm_counts) %in% use_features_for_pca, ])
  
  # Run PCA on the log-CPM data
  res.pca.norm <- prcomp(log_cpm_counts_t, scale = TRUE, rank.=10)
  
  # Create a data frame for visualization
  log_cpm_counts_PCs <- data.frame(sample_pheno_data, res.pca.norm$x)
  
  # Print the PCA plot
  print(ggplot(log_cpm_counts_PCs, aes(x = PC1, y = PC2, color = phenotype)) +
    geom_point(aes(size=0.1)) +
    theme_bw() +
    ggtitle(paste0("cluster_", cluster)) +
    theme(text = element_text(size = 16)))

  fit <- glmQLFit(y, design=mm)
  qlf <- glmQLFTest(fit, coef=2)
  topGenes <- topTags(qlf, n = nrow(y))
  
  diff_res <- data.frame(topGenes$table, cluster = rep(cluster, nrow(topGenes$table)))

  return(diff_res)
}

# Apply the function to all clusters to keep
all_diff_res <- lapply(clusters_toKeep, clusterwise_pseudobulk_expression_association, pb_counts, sample_pheno_data)
## [1] "evaluating cluster mTEC3"
## [1] "no. of genes after filtering =  18469"

## [1] "evaluating cluster mTEC2"
## [1] "no. of genes after filtering =  21152"

## [1] "evaluating cluster mature cTEC"
## [1] "no. of genes after filtering =  13917"

## [1] "evaluating cluster mTEC1"
## [1] "no. of genes after filtering =  15574"

## [1] "evaluating cluster PNP"
## [1] "no. of genes after filtering =  14756"

## [1] "evaluating cluster immature cTEC"
## [1] "no. of genes after filtering =  13770"

## [1] "evaluating cluster Early progenitors"
## [1] "no. of genes after filtering =  13322"

## [1] "evaluating cluster Transient amplifying progenitors"
## [1] "no. of genes after filtering =  14192"

all_diff_res %<>% do.call("rbind", .)

##add adjustment for all hypothesis tests - across all genes and all clusters
all_diff_res %<>% mutate(p.adj.global = p.adjust(PValue, "fdr"),
                         p.adj.local = FDR) %>%
  select(-FDR)
mutateddf <- mutate(all_diff_res, sig=ifelse(all_diff_res$p.adj.local<0.01 & (all_diff_res$logFC>1.2 | all_diff_res$logFC<(-1.2)), "Sig", "Not Sig"))
mutateddf <- rownames_to_column(mutateddf, var = "Gene")
mutateddf$cluster <- as.factor(mutateddf$cluster)
input_fg<-subset(mutateddf, sig == "Sig")

# Create volcano plots for each cluster

cluster_mTEC1_bg <- subset(mutateddf, cluster == "mTEC1")
cluster_mTEC1_fg <- subset(input_fg, cluster == "mTEC1")

plot_mTEC1 <- ggplot(cluster_mTEC1_bg, aes(logFC, -log10(p.adj.local))) +
    geom_point(color = "grey", alpha = 0.1) +
    geom_point(data = cluster_mTEC1_fg, aes(col = sig, alpha = 0.4)) +
    ggtitle(paste("mTEC1")) +
    theme_bw() +
    theme(axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14),
          legend.position = "none") +
    xlab("logFC") +
    ylab("-log10 q value") +
    geom_text_repel(data = head(cluster_mTEC1_fg, 20), aes(label = Gene), size = 3, 
                    box.padding = unit(0.1, "lines"),
                    point.padding = unit(0.1, "lines"))

cluster_mTEC2_bg <- subset(mutateddf, cluster == "mTEC2")
cluster_mTEC2_fg <- subset(input_fg, cluster == "mTEC2")

plot_mTEC2 <- ggplot(cluster_mTEC2_bg, aes(logFC, -log10(p.adj.local))) +
    geom_point(color = "grey", alpha = 0.1) +
    geom_point(data = cluster_mTEC2_fg, aes(col = sig, alpha = 0.4)) +
    ggtitle(paste("mTEC2")) +
    theme_bw() +
    theme(axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14),
          legend.position = "none") +
    xlab("logFC") +
    ylab("-log10 q value") +
    geom_text_repel(data = head(cluster_mTEC2_fg, 20), aes(label = Gene), size = 3, 
                    box.padding = unit(0.1, "lines"),
                    point.padding = unit(0.1, "lines"))

cluster_mTEC3_bg <- subset(mutateddf, cluster == "mTEC3")
cluster_mTEC3_fg <- subset(input_fg, cluster == "mTEC3")

plot_mTEC3 <- ggplot(cluster_mTEC3_bg, aes(logFC, -log10(p.adj.local))) +
    geom_point(color = "grey", alpha = 0.1) +
    geom_point(data = cluster_mTEC3_fg, aes(col = sig, alpha = 0.4)) +
    ggtitle(paste("mTEC3")) +
    theme_bw() +
    theme(axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14),
          legend.position = "none") +
    xlab("logFC") +
    ylab("-log10 q value") +
    geom_text_repel(data = head(cluster_mTEC3_fg, 20), aes(label = Gene), size = 3, 
                    box.padding = unit(0.1, "lines"),
                    point.padding = unit(0.1, "lines"))

cluster_PNP_bg <- subset(mutateddf, cluster == "PNP")
cluster_PNP_fg <- subset(input_fg, cluster == "PNP")

plot_PNP <- ggplot(cluster_PNP_bg, aes(logFC, -log10(p.adj.local))) +
    geom_point(color = "grey", alpha = 0.1) +
    geom_point(data = cluster_PNP_fg, aes(col = sig, alpha = 0.4)) +
    ggtitle(paste("PNP")) +
    theme_bw() +
    theme(axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14),
          legend.position = "none") +
    xlab("logFC") +
    ylab("-log10 q value") +
    geom_text_repel(data = head(cluster_PNP_fg, 20), aes(label = Gene), size = 3, 
                    box.padding = unit(0.1, "lines"),
                    point.padding = unit(0.1, "lines"))

cluster_mcTEC_bg <- subset(mutateddf, cluster == "mature cTEC")
cluster_mcTEC_fg <- subset(input_fg, cluster == "mature cTEC")

plot_mcTEC <- ggplot(cluster_mcTEC_bg, aes(logFC, -log10(p.adj.local))) +
    geom_point(color = "grey", alpha = 0.1) +
    geom_point(data = cluster_mcTEC_fg, aes(col = sig, alpha = 0.4)) +
    ggtitle(paste("mature cTEC")) +
    theme_bw() +
    theme(axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14),
          legend.position = "none") +
    xlab("logFC") +
    ylab("-log10 q value") +
    geom_text_repel(data = head(cluster_mcTEC_fg, 20), aes(label = Gene), size = 3, 
                    box.padding = unit(0.1, "lines"),
                    point.padding = unit(0.1, "lines"))

cluster_imcTEC_bg <- subset(mutateddf, cluster == "immature cTEC")
cluster_imcTEC_fg <- subset(input_fg, cluster == "immature cTEC")

plot_imcTEC <- ggplot(cluster_imcTEC_bg, aes(logFC, -log10(p.adj.local))) +
    geom_point(color = "grey", alpha = 0.1) +
    geom_point(data = cluster_imcTEC_fg, aes(col = sig, alpha = 0.4)) +
    ggtitle(paste("immature cTEC")) +
    theme_bw() +
    theme(axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14),
          legend.position = "none") +
    xlab("logFC") +
    ylab("-log10 q value") +
    geom_text_repel(data = head(cluster_imcTEC_fg, 20), aes(label = Gene), size = 3, 
                    box.padding = unit(0.1, "lines"),
                    point.padding = unit(0.1, "lines"))

cluster_EP_bg <- subset(mutateddf, cluster == "Early progenitors")
cluster_EP_fg <- subset(input_fg, cluster == "Early progenitors")

plot_EP <- ggplot(cluster_EP_bg, aes(logFC, -log10(p.adj.local))) +
    geom_point(color = "grey", alpha = 0.1) +
    geom_point(data = cluster_EP_fg, aes(col = sig, alpha = 0.4)) +
    ggtitle(paste("Early progenitors")) +
    theme_bw() +
    theme(axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14),
          legend.position = "none") +
    xlab("logFC") +
    ylab("-log10 q value") +
    geom_text_repel(data = head(cluster_EP_fg, 20), aes(label = Gene), size = 3, 
                    box.padding = unit(0.1, "lines"),
                    point.padding = unit(0.1, "lines"))

cluster_TAP_bg <- subset(mutateddf, cluster == "Transient amplifying progenitors")
cluster_TAP_fg <- subset(input_fg, cluster == "Transient amplifying progenitors")

plot_TAP <- ggplot(cluster_TAP_bg, aes(logFC, -log10(p.adj.local))) +
    geom_point(color = "grey", alpha = 0.1) +
    geom_point(data = cluster_TAP_fg, aes(col = sig, alpha = 0.4)) +
    ggtitle(paste("Transient amplifying progenitors")) +
    theme_bw() +
    theme(axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14),
          legend.position = "none") +
    xlab("logFC") +
    ylab("-log10 q value") +
    geom_text_repel(data = head(cluster_TAP_fg, 20), aes(label = Gene), size = 3, 
                    box.padding = unit(0.1, "lines"),
                    point.padding = unit(0.1, "lines"))

plot_mTEC1|plot_mTEC2|plot_mTEC3|plot_PNP
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

plot_mcTEC|plot_imcTEC|plot_TAP|plot_EP
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

DefaultAssay(singlets) <- "SCT"
p5 <- FeaturePlot_scCustom(singlets, features=c("Ackr4", "Dll4", "Psmb11", "Ccl21a", "Aire", "Fezf2"), reduction="umap", colors_use= viridis_plasma_dark_high, na_color="lightgray", pt.size=0.5, label=FALSE)& NoAxes() & NoLegend()
p5

wt1_data <- subset(x = singlets, subset = HTO_classification == "WT1")
wt2_data <- subset(x = singlets, subset = HTO_classification == "WT2")
wt3_data <- subset(x = singlets, subset = HTO_classification == "WT3")
ko1_data <- subset(x = singlets, subset = HTO_classification == "KO1")
ko2_data <- subset(x = singlets, subset = HTO_classification == "KO2")
ko3_data <- subset(x = singlets, subset = HTO_classification == "KO3")
gsva_wt1 <- analyse_sc_clusters(wt1_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
gsva_wt2 <- analyse_sc_clusters(wt2_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
gsva_wt3 <- analyse_sc_clusters(wt3_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
gsva_ko1 <- analyse_sc_clusters(ko1_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
gsva_ko2 <- analyse_sc_clusters(ko2_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
gsva_ko3 <- analyse_sc_clusters(ko3_data, use_interactors = FALSE, verbose = TRUE)
## Calculating average cluster expression...
## Converting expression data to string... (This may take a moment)
## Conversion complete
## Submitting request to Reactome API...
## Compressing request data...
## Reactome Analysis submitted succesfully
## Converting dataset Seurat...
## Mapping identifiers...
## Performing gene set analysis using ssGSEA
## Analysing dataset 'Seurat' using ssGSEA
## Retrieving result...
pathway_wt1 <- pathways(gsva_wt1)
pathway_wt2 <- pathways(gsva_wt2)
pathway_wt3 <- pathways(gsva_wt3)

pathway_ko1 <- pathways(gsva_ko1)
pathway_ko2 <- pathways(gsva_ko2)
pathway_ko3 <- pathways(gsva_ko3)

colnames(pathway_wt1) <- gsub("\\.Seurat", "", colnames(pathway_wt1))
colnames(pathway_wt2) <- gsub("\\.Seurat", "", colnames(pathway_wt2))
colnames(pathway_wt3) <- gsub("\\.Seurat", "", colnames(pathway_wt3))

colnames(pathway_ko1) <- gsub("\\.Seurat", "", colnames(pathway_ko1))
colnames(pathway_ko2) <- gsub("\\.Seurat", "", colnames(pathway_ko2))
colnames(pathway_ko3) <- gsub("\\.Seurat", "", colnames(pathway_ko3))
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:gdata':
## 
##     starts_with
## The following object is masked from 'package:magrittr':
## 
##     extract
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
long_wt1 <- gather(pathway_wt1, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
long_wt2 <- gather(pathway_wt2, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
long_wt3 <- gather(pathway_wt3, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)

long_ko1 <- gather(pathway_ko1, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
long_ko2 <- gather(pathway_ko2, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)
long_ko3 <- gather(pathway_ko3, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)

long_wt1$HTO <- 'WT1'
long_wt2$HTO <- 'WT2'
long_wt3$HTO <- 'WT3'

long_ko1$HTO <- 'KO1'
long_ko2$HTO <- 'KO2'
long_ko3$HTO <- 'KO3'


pathway <- rbind(long_wt1, long_wt2, long_wt3, long_ko1, long_ko2, long_ko3)
pathway_wide <- spread(pathway, HTO, score)
pathway_wide<- pathway_wide[complete.cases(pathway_wide), ]
pathway_wide <- subset(pathway_wide, pathway_wide$cluster != "Residual")
pathway_long <- gather(pathway_wide, HTO, score, KO1:WT3, factor_key=TRUE)

hto_phenotype_map <- data.frame(
  HTO = c("WT1", "WT2", "WT3", "KO1", "KO2", "KO3"),
  phenotype = c("WT", "WT", "WT", "KO", "KO", "KO")
)

df <- pathway_long %>%
  left_join(hto_phenotype_map, by = "HTO")

long_wt <- gather(pathway_wt, cluster, score, Early_progenitors:Transient_amplifying_progenitors, factor_key=TRUE)

calculate_p_values <- function(df) {
  p_values_df <- df %>%
    group_by(Name, cluster) %>%
    summarize(p_value = t.test(score[phenotype == "WT"], score[phenotype == "KO"])$p.value) %>%
    ungroup()

  return(p_values_df)
}

p_values <- calculate_p_values(df)
## `summarise()` has grouped output by 'Name'. You can override using the
## `.groups` argument.
print(p_values)
## # A tibble: 17,469 Ă— 3
##    Name                                                          cluster p_value
##    <chr>                                                         <fct>     <dbl>
##  1 5-Phosphoribose 1-diphosphate biosynthesis                    Early_…  0.0414
##  2 5-Phosphoribose 1-diphosphate biosynthesis                    immatu…  0.268 
##  3 5-Phosphoribose 1-diphosphate biosynthesis                    mature…  0.720 
##  4 5-Phosphoribose 1-diphosphate biosynthesis                    mTEC1    0.556 
##  5 5-Phosphoribose 1-diphosphate biosynthesis                    mTEC2    0.535 
##  6 5-Phosphoribose 1-diphosphate biosynthesis                    mTEC3    0.0352
##  7 5-Phosphoribose 1-diphosphate biosynthesis                    PNP      0.372 
##  8 5-Phosphoribose 1-diphosphate biosynthesis                    prolif…  0.770 
##  9 5-Phosphoribose 1-diphosphate biosynthesis                    Transi…  0.182 
## 10 A tetrasaccharide linker sequence is required for GAG synthe… Early_…  0.767 
## # ℹ 17,459 more rows
library(dplyr)

mean_df <- df %>%
    group_by(Name, cluster, phenotype) %>%
    summarize(mean_score = mean(score, na.rm = TRUE), .groups = "drop") %>%
    pivot_wider(names_from = phenotype, values_from = mean_score)

combined_df <- dplyr::left_join(mean_df, p_values, by = c("Name", "cluster"))
sig_df <- subset(combined_df, p_value< 0.01)
sig_df$diff <- sig_df$KO - sig_df$WT
# Assuming your data frame is called 'df'
# and the columns are named 'KO', 'WT', 'difference', and 'cluster'

result_list <- list() # Initialize an empty list to store results

unique_clusters <- unique(sig_df$cluster) # Get unique cluster values

for (cluster_val in unique_clusters) {
  cluster_df <- sig_df[sig_df$cluster == cluster_val, ] # Subset data for the current cluster

  if (nrow(cluster_df) > 0) { # Check if there are any rows for the current cluster
    cluster_df_ordered <- cluster_df[order(cluster_df$diff), ] # Order by difference

    top3 <- tail(cluster_df_ordered, 3)
    bottom3 <- head(cluster_df_ordered, 3)

    result_list[[cluster_val]] <- rbind(bottom3, top3) # Store results in the list, including all columns
  }else{
    result_list[[cluster_val]] <- NULL # if no rows for the cluster, store NULL.
  }
}

# result_list now contains a list of data frames,
# where each element is the top 3 and bottom 3 rows
# for a specific cluster.

# Example of accessing the result for cluster "A":
# result_list$A

#If you want to combine everything into one dataframe, but keep track of the cluster.
combined_df <- do.call(rbind, lapply(names(result_list), function(name) {
  if(!is.null(result_list[[name]])){
    cbind(result_list[[name]], cluster_original = name) # Retains all original columns
  }
}))
plot <- ggplot(
  combined_df %>%
    pivot_longer(cols = c("WT", "KO"), names_to = "phenotype", values_to = "mean_score") %>%
    group_by(cluster_original) %>%
    mutate(
      Name = factor(
        Name,
        levels = unique((.) %>% filter(phenotype == "KO") %>% arrange(desc(mean_score)) %>% pull(Name))
      )
    ),
  aes(y = Name, x = mean_score, fill = phenotype)
) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ cluster_original, scales = "free_y") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Mean Pathway Scores by Cluster and Phenotype",
    x = "Mean Score",
    y = "Name",
    fill = "Phenotype"
  )

plot

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.3.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Toronto
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidyr_1.3.1        gdata_3.0.1        ReactomeGSA_1.18.0 decoupleR_2.10.0  
##  [5] SCpubr_2.0.2       openxlsx_4.2.8     HGNChelper_0.8.15  ggrepel_0.9.6     
##  [9] tibble_3.2.1       matrixStats_1.5.0  edgeR_4.2.2        limma_3.60.6      
## [13] presto_1.0.0       data.table_1.17.0  Rcpp_1.0.14        scCustomize_3.0.1 
## [17] magrittr_2.0.3     lme4_1.1-36        Matrix_1.7-3       glmGamPoi_1.16.0  
## [21] ggplot2_3.5.1      patchwork_1.3.0    sctransform_0.4.1  Seurat_5.2.1      
## [25] SeuratObject_5.0.2 sp_2.2-0           dplyr_1.1.4       
## 
## loaded via a namespace (and not attached):
##   [1] fs_1.6.5                    spatstat.sparse_3.1-0      
##   [3] bitops_1.0-9                lubridate_1.9.4            
##   [5] httr_1.4.7                  RColorBrewer_1.1-3         
##   [7] tools_4.4.0                 utf8_1.2.4                 
##   [9] R6_2.6.1                    lazyeval_0.2.2             
##  [11] uwot_0.2.3                  withr_3.0.2                
##  [13] prettyunits_1.2.0           gridExtra_2.3              
##  [15] progressr_0.15.1            cli_3.6.4                  
##  [17] Biobase_2.64.0              spatstat.explore_3.4-2     
##  [19] fastDummies_1.7.5           labeling_0.4.3             
##  [21] sass_0.4.9                  prismatic_1.1.2            
##  [23] spatstat.data_3.1-6         ggridges_0.5.6             
##  [25] pbapply_1.7-2               yulab.utils_0.2.0          
##  [27] R.utils_2.13.0              parallelly_1.42.0          
##  [29] rstudioapi_0.17.1           generics_0.1.3             
##  [31] gridGraphics_0.5-1          shape_1.4.6.1              
##  [33] gtools_3.9.5                ica_1.0-3                  
##  [35] spatstat.random_3.3-3       zip_2.3.2                  
##  [37] ggbeeswarm_0.7.2            S4Vectors_0.42.1           
##  [39] abind_1.4-8                 R.methodsS3_1.8.2          
##  [41] lifecycle_1.0.4             yaml_2.3.10                
##  [43] snakecase_0.11.1            SummarizedExperiment_1.34.0
##  [45] gplots_3.2.0                SparseArray_1.4.8          
##  [47] Rtsne_0.17                  paletteer_1.6.0            
##  [49] grid_4.4.0                  promises_1.3.2             
##  [51] crayon_1.5.3                miniUI_0.1.1.1             
##  [53] lattice_0.22-6              cowplot_1.1.3              
##  [55] pillar_1.10.1               knitr_1.50                 
##  [57] GenomicRanges_1.56.2        boot_1.3-31                
##  [59] future.apply_1.11.3         codetools_0.2-20           
##  [61] glue_1.8.0                  spatstat.univar_3.1-2      
##  [63] vctrs_0.6.5                 png_0.1-8                  
##  [65] spam_2.11-1                 Rdpack_2.6.3               
##  [67] gtable_0.3.6                assertthat_0.2.1           
##  [69] rematch2_2.1.2              cachem_1.1.0               
##  [71] xfun_0.51                   rbibutils_2.3              
##  [73] S4Arrays_1.4.1              mime_0.13                  
##  [75] reformulas_0.4.0            survival_3.8-3             
##  [77] statmod_1.5.0               fitdistrplus_1.2-2         
##  [79] ROCR_1.0-11                 nlme_3.1-167               
##  [81] progress_1.2.3              RcppAnnoy_0.0.22           
##  [83] GenomeInfoDb_1.40.1         bslib_0.9.0                
##  [85] irlba_2.3.5.1               vipor_0.4.7                
##  [87] KernSmooth_2.23-26          splitstackshape_1.4.8      
##  [89] colorspace_2.1-1            BiocGenerics_0.50.0        
##  [91] ggrastr_1.0.2               tidyselect_1.2.1           
##  [93] compiler_4.4.0              curl_6.2.1                 
##  [95] DelayedArray_0.30.1         plotly_4.10.4              
##  [97] scales_1.3.0                caTools_1.18.3             
##  [99] lmtest_0.9-40               stringr_1.5.1              
## [101] digest_0.6.37               goftest_1.2-3              
## [103] spatstat.utils_3.1-3        minqa_1.2.8                
## [105] rmarkdown_2.29              XVector_0.44.0             
## [107] htmltools_0.5.8.1           pkgconfig_2.0.3            
## [109] sparseMatrixStats_1.16.0    MatrixGenerics_1.16.0      
## [111] fastmap_1.2.0               rlang_1.1.5                
## [113] GlobalOptions_0.1.2         htmlwidgets_1.6.4          
## [115] UCSC.utils_1.0.0            shiny_1.10.0               
## [117] DelayedMatrixStats_1.26.0   farver_2.1.2               
## [119] jquerylib_0.1.4             zoo_1.8-13                 
## [121] jsonlite_1.9.1              BiocParallel_1.38.0        
## [123] R.oo_1.27.0                 GenomeInfoDbData_1.2.12    
## [125] ggplotify_0.1.2             dotCall64_1.2              
## [127] munsell_0.5.1               viridis_0.6.5              
## [129] reticulate_1.41.0.1         stringi_1.8.4              
## [131] zlibbioc_1.50.0             MASS_7.3-65                
## [133] plyr_1.8.9                  parallel_4.4.0             
## [135] listenv_0.9.1               forcats_1.0.0              
## [137] deldir_2.0-4                splines_4.4.0              
## [139] tensor_1.5                  hms_1.1.3                  
## [141] circlize_0.4.16             locfit_1.5-9.12            
## [143] igraph_2.1.4                spatstat.geom_3.3-6        
## [145] RcppHNSW_0.6.0              reshape2_1.4.4             
## [147] stats4_4.4.0                evaluate_1.0.3             
## [149] ggprism_1.0.5               nloptr_2.2.1               
## [151] httpuv_1.6.15               RANN_2.6.2                 
## [153] purrr_1.0.4                 polyclip_1.10-7            
## [155] future_1.34.0               scattermore_1.2            
## [157] janitor_2.2.1               xtable_1.8-4               
## [159] RSpectra_0.16-2             later_1.4.1                
## [161] viridisLite_0.4.2           beeswarm_0.4.0             
## [163] IRanges_2.38.1              cluster_2.1.8.1            
## [165] timechange_0.3.0            globals_0.16.3